Load the data from the buckets

# this GCP bucket path was removed for security reasons, see the repo's readme
gcp_bucket = "ADDPATH"
bead_counts = dl_read_gcp(sprintf("%s/bead_counts/pass1b-06_immunoassay_bead-counts.txt", gcp_bucket), 
                          OUTDIR=tmp_dir, sep = "\t")
# sample metadata should be obtained from the MoTrPAC data portal
metadata = dl_read_gcp(sprintf("%s/sample_metadata_20210803.csv", gcp_bucket), 
                          OUTDIR=tmp_dir, sep = ",")

# name of different panels
unique(bead_counts[,panel_name])
## [1] "rat-metabolic" "rat-adipokine" "rat-mag27plex" "rat-myokine"  
## [5] "rat-pituitary"
mfi_list = list()
for(mfi_file in system(sprintf("gsutil ls %s/mfi/", gcp_bucket), intern=T)){
  mfi = dl_read_gcp(mfi_file, OUTDIR=tmp_dir, sep='\t')
  panel = gsub(".*_mfi_|\\.txt", "", mfi_file) 
  mfi_list[[panel]] = mfi
  writeLines(panel)
}
## ADIPONECTIN
## SERPIN-E
## rat-mag27plex
## rat-metabolic
## rat-myokine
## rat-pituitary

Failed measurements with low bead count

# all of MoTrPAC samples' vial labels start with 90, rest are reference samples or standards.
metadata_experiment_samples = metadata[grepl("^90",vial_label)]
metadata_experiment_samples[,group := ifelse(intervention=='control', 'control', sacrifice_time)]

bead_counts_experiment_samples = bead_counts[grepl("^90",vial_label) & !grepl("CHEX[1,2,3]",feature_ID)]
bead_counts_metadata = merge(bead_counts_experiment_samples, metadata_experiment_samples,
                             by=c('vial_label','plate_id','luminex_sample_name','panel_name'))

flagged = bead_counts_metadata[bead_count < 20]
dim(flagged)
## [1] 182  23
cat("flagged: low bead count:\n")
## flagged: low bead count:
table(flagged[,panel_name],flagged[,tissue_code])
##                
##                 t55-gastrocnemius t61-colon t62-spleen t66-lung
##   rat-mag27plex                 2       152          8       10
##   rat-metabolic                 4         0          0        0
##   rat-myokine                   2         0          0        0
##                
##                 t70-white-adipose
##   rat-mag27plex                 1
##   rat-metabolic                 2
##   rat-myokine                   1
cat("\ntotal:\n")
## 
## total:
table(bead_counts_metadata[,panel_name],bead_counts_metadata[,tissue_code])
##                
##                 t31-plasma t52-hippocampus t53-cortex t55-gastrocnemius
##   rat-adipokine        120               0          0                90
##   rat-mag27plex        840             840        840               840
##   rat-metabolic        390               0          0               390
##   rat-myokine          390             390        390               390
##   rat-pituitary        240             240        240                 0
##                
##                 t56-vastus-lateralis t58-heart t59-kidney t60-adrenal t61-colon
##   rat-adipokine                    0         0          0           0         0
##   rat-mag27plex                  840       840        840         840       840
##   rat-metabolic                    0         0          0         390       390
##   rat-myokine                    390       390          0           0         0
##   rat-pituitary                    0         0          0         240         0
##                
##                 t62-spleen t63-testes t64-ovaries t66-lung t67-small-intestine
##   rat-adipokine          0          0           0        0                   0
##   rat-mag27plex        840        420         420      840                 840
##   rat-metabolic          0          0           0        0                 390
##   rat-myokine            0        195         195        0                   0
##   rat-pituitary          0        120         120        0                   0
##                
##                 t68-liver t69-brown-adipose t70-white-adipose
##   rat-adipokine         0                90                90
##   rat-mag27plex       840               840               840
##   rat-metabolic       390               390               390
##   rat-myokine           0               390               390
##   rat-pituitary         0                 0                 0
flagged_table = data.table(table(flagged[,panel_name],flagged[,tissue_code]))
# 152 out of 840 (18%) t61-colon measurements (including CHEX4) in rat-mag27plex have a low bead count.

ggplot(flagged_table, aes(x=V1, y=N, fill=V2)) +
  geom_bar(stat = 'identity', colour='black') +
  theme_classic() +
  scale_fill_manual(values=tissue_cols) +
  theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=0.5, colour='black')) +
  labs(x='LUMINEX panel', y='N flagged wells', fill='Tissue', title='Number of wells with bead count < 20') 

Preprocess MFI

pre_process_mfi = function(mfi_dt, metadata){
  mfi_experiment_samples = mfi_dt[grepl("^90",vial_label)]
  mfi_experiment_samples[,c("CHEX1", "CHEX2", "CHEX3") := NULL]

  # take log2 of mfi values before merging
  metadata_cols = c('vial_label','plate_id','luminex_sample_name','panel_name')
  to_log = colnames(mfi_experiment_samples)[!colnames(mfi_experiment_samples) %in% metadata_cols]
  mfi_experiment_samples[, (to_log) := lapply(.SD, log), .SDcols=to_log]
  
  # merge with metadata
  mfi_meta = merge(metadata, mfi_experiment_samples, by=c('panel_name','plate_id','luminex_sample_name','vial_label'))
  assert(nrow(mfi_meta) == nrow(mfi_experiment_samples))

  # flag any CHEX4 values > 100
  flagged = mfi_meta[CHEX4 > log2(100)]
  if(nrow(flagged)>0){
    cat("flagged: CHEX4 values > 100:\n")
    print(flagged[,c(metadata_cols, 'tissue_code'), with=F])
  }
  
  return(mfi_meta)
}

processed_mfi_list = list()
for(panel in names(mfi_list)){
  writeLines(panel)
  mfi = mfi_list[[panel]]
  processed_mfi_list[[panel]] = pre_process_mfi(mfi, metadata_experiment_samples)
}
## ADIPONECTIN
## SERPIN-E
## rat-mag27plex
## rat-metabolic
## rat-myokine
## rat-pituitary

Correlate analytes measured on multiple panels

#proteins measured by more than 1 panel
LEPTIN <- c("rat-mag27plex", "rat-metabolic")
MCP1 <- c("rat-mag27plex", "rat-metabolic")
IL6 <- c("rat-mag27plex", "rat-metabolic", "rat-myokine")
FRACTALKINE <- c("rat-mag27plex", "rat-myokine")
BDNF <- c("rat-pituitary", "rat-myokine")

common_analytes <- list(LEPTIN,IL6,MCP1,FRACTALKINE,BDNF)
names(common_analytes) <- c("LEPTIN","IL6","MCP1","FRACTALKINE","BDNF")

combine_diff_assays <- function(list_name, assay_name1, assay_name2, analyte){
  combined_assays <- merge(list_name[[assay_name1]], 
                           list_name[[assay_name2]][,c("vial_label","tissue_code","group",analyte),with=F], 
                           by=c('vial_label','tissue_code','group'), 
                           suffixes=c( paste("",assay_name1,sep="_"), paste("",assay_name2,sep="_")))
  combined_assays_sorted <- combined_assays[order(tissue_code, group)]
  
  return(combined_assays_sorted)
}

# find correlation for a a given analyte from two different assays (from common tissues)
find_corr <- function(list_name, assay_name1, assay_name2, analyte){
  combined_assay <- combine_diff_assays(list_name, assay_name1, assay_name2, analyte)
  
  analyte_range <- range(c(combined_assay[,get(paste(analyte,assay_name1,sep="_"))],
                           combined_assay[,get(paste(analyte,assay_name2,sep="_"))]), na.rm=T) # try using finite=F
  analyte_range[1] <- floor(analyte_range[1])
  analyte_range[2] <- ceiling(analyte_range[2])
  
  correlation = cor(combined_assay[,get(paste(analyte,assay_name1,sep="_"))], 
                    combined_assay[,get(paste(analyte,assay_name2,sep="_"))],method="sp",use="complete.obs")
  print(correlation)
  
  p <- ggplot(combined_assay, aes(x=get(paste(analyte,assay_name1,sep="_")),
                                  y=get(paste(analyte,assay_name2,sep="_")), 
                                  col=tissue_code, 
                                  shape=group)) +
    geom_point() + 
    geom_abline() + 
    coord_fixed(xlim=analyte_range, ylim=analyte_range) +
    labs(x = assay_name1, y = assay_name2, title = paste(analyte,format(correlation,digits=2))) + 
    scale_color_manual(values=tissue_cols) + 
    theme_bw()
  print(p)
}

for(analyte in 1:length(common_analytes))
{
  print(names(common_analytes)[[analyte]])
  l <- length(common_analytes[[analyte]])

  for(i in 1:(l-1))
  {
    for(j in (i+1):l)
    {
      print(c(common_analytes[[analyte]][i],common_analytes[[analyte]][j]))
      find_corr(processed_mfi_list,common_analytes[[analyte]][i],common_analytes[[analyte]][j],names(common_analytes)[[analyte]])
    }
  }
  cat("\n")
}
## [1] "LEPTIN"
## [1] "rat-mag27plex" "rat-metabolic"
## [1] 0.9468605
## Warning: Removed 21 rows containing missing values (geom_point).

## 
## [1] "IL6"
## [1] "rat-mag27plex" "rat-metabolic"
## [1] 0.7380667
## Warning: Removed 2 rows containing missing values (geom_point).

## [1] "rat-mag27plex" "rat-myokine"  
## [1] 0.6786387

## [1] "rat-metabolic" "rat-myokine"  
## [1] 0.5822081

## 
## [1] "MCP1"
## [1] "rat-mag27plex" "rat-metabolic"
## [1] 0.4309041
## Warning: Removed 5 rows containing missing values (geom_point).

## 
## [1] "FRACTALKINE"
## [1] "rat-mag27plex" "rat-myokine"  
## [1] 0.8833702

## 
## [1] "BDNF"
## [1] "rat-pituitary" "rat-myokine"  
## [1] 0.8566376

PCA

replace_missing <- function(x){
  x[is.na(x)] = mean(x, na.rm=T)
  return(x)
}

calculate_pca <- function(list_name, assay_name){
  # everything before and including "group"
  dt = copy(list_name[[assay_name]])
  non_mfi_cols = colnames(dt)[1:which(colnames(dt)=='group')]
  dt[,c(non_mfi_cols, 'CHEX4') := NULL]
  # set missing values to mean 
  dt = dt[,lapply(.SD, replace_missing)]
  
  pca = prcomp(dt, scale.=T, center=T)
  if(dim(pca$x)[2] < 2){
    message(sprintf("Panel %s does not have enough analytes to plot 2 PCs.", assay_name))
    return()
  }
  df = data.frame(
    PC1 = pca$x[,1],PC2 = pca$x[,2],
    group = list_name[[assay_name]][,group],
    sex = list_name[[assay_name]][,sex],
    tissue = list_name[[assay_name]][,tissue_code],
    stringsAsFactors = F
  )
  p = ggplot(df) + 
    geom_point(aes(x=PC1, y=PC2,col=tissue, shape=group),size=3) +
    scale_color_manual(values=tissue_cols) + 
    ggtitle(assay_name) +
    theme_bw()
  plot(p)
}

for(panel in names(processed_mfi_list)){
  print(panel)
  calculate_pca(processed_mfi_list, panel)
}
## [1] "ADIPONECTIN"
## Panel ADIPONECTIN does not have enough analytes to plot 2 PCs.
## [1] "SERPIN-E"
## Panel SERPIN-E does not have enough analytes to plot 2 PCs.
## [1] "rat-mag27plex"

## [1] "rat-metabolic"

## [1] "rat-myokine"

## [1] "rat-pituitary"

# PCA for each tissue
calculate_pca_tissue <- function(list_name, assay_name){
  tissues <- unique(list_name[[assay_name]][,tissue_code])

  for(t in tissues)
  {
    dt_tissue = list_name[[assay_name]][tissue_code==t]
    mfi_cols = colnames(dt_tissue)[(which(colnames(dt_tissue)=='group')+1):ncol(dt_tissue)]
    mfi_cols = mfi_cols[!mfi_cols == 'CHEX4']
    # set missing values to mean 
    dt_tissue = dt_tissue[,(mfi_cols) := lapply(.SD, replace_missing), .SDcols = mfi_cols]
    
    if(length(mfi_cols) < 3){
      message(sprintf("Panel %s does not have enough analytes to plot 2 PCs.", assay_name))
      return()
    }
    
    pca = prcomp(dt_tissue[,mfi_cols,with=F], scale.=T, center=T)
    df = data.frame(
      PC1 = pca$x[,1],PC2 = pca$x[,2],
      group = dt_tissue[,"group"],
      sex = dt_tissue[,"sex"],
      tissue = dt_tissue[,"tissue_code"],
      stringsAsFactors = F
    )
    p = ggplot(df) + 
      geom_point(aes(x=PC1, y=PC2,fill=group, shape=sex),size=3, colour='black') +
      scale_fill_manual(values=group_cols) + 
      labs(title=assay_name, subtitle=t) +
      theme_classic() +
      guides(fill=guide_legend(override.aes = list(shape=21))) +
      scale_shape_manual(values=c(female=21, male=24))
    plot(p)
  }
}

for(panel in names(processed_mfi_list)){
  calculate_pca_tissue(processed_mfi_list, panel)
}
## Panel ADIPONECTIN does not have enough analytes to plot 2 PCs.
## Panel SERPIN-E does not have enough analytes to plot 2 PCs.

Impute missing values

Missing values are due to bead counts < 20, which is not correlated with actual analyte abundance.

# first get data into slightly different format
name_to_vl = unique(metadata[,.(vial_label, luminex_sample_name, tissue_code, bid, sex, intervention, sacrifice_time)])
# remove reference standards from this step. they are in duplicate so it's hard to merge them with the other data
name_to_vl = name_to_vl[!grepl("^8", vial_label)]
name_to_vl[, group := ifelse(intervention=='control','control',sacrifice_time)]
name_to_vl[,c('intervention','sacrifice_time'):=NULL]

data_list = list()
for(panel in names(mfi_list)){
  mfi = copy(mfi_list[[panel]])
  
  need_to_remove = c("CHEX1", "CHEX2", "CHEX3")
  mfi[,(need_to_remove) := NULL]
  # merge with viallabel
  # this removes standard curve, background, and ref stds 
  p = merge(name_to_vl, mfi, by=c("vial_label", "luminex_sample_name"))
  
  # split metadata and intensities
  meta = p[,.(bid, vial_label, luminex_sample_name, tissue_code, panel_name, plate_id, sex, group, CHEX4)]
  mfi = p
  mfi[,c("tissue_code", "plate_id", "panel_name", "CHEX4", "sex", "group", "bid") := NULL]
  
  # convert to data.frame for portability
  mfi = as.data.frame(mfi, check.names=F)
  meta = data.frame(meta, stringsAsFactors=F)
  if(any(duplicated(mfi$vial_label))){
    stop(sprintf("Duplicate vial labels: %s, %s", panel, unique(meta$tissue_code[duplicated(meta$vial_label)])))
  }
  rownames(mfi) = mfi$vial_label
  mfi$luminex_sample_name = NULL
  mfi$vial_label = NULL
  rownames(meta) = meta$vial_label
  
  # MFI and meta$CHEX4 should be log2-transformed before differential analysis
  mfi = data.frame(lapply(mfi, function(x) log2(as.numeric(x))),check.names=F, row.names = rownames(mfi))
  meta$log2_CHEX4 = log2(as.numeric(meta$CHEX4))
  
  # differential analysis should be performed independently for each tissue. these data frames include all tissues for an assay 
  data_list[[panel]] = list(log2_mfi=mfi,
                            metadata=meta)
  
}

For each tissue within each panel:
- Remove samples with >50% missing values (i.e. bead counts < 20)
- Remove features where more than 2 values for a single group are NA
- Impute missing values with KNN

imputed_data = list()
removed_samples = list()
i=1
for(panel in names(data_list)){
  
  data = data_list[[panel]]$log2_mfi
  data$dummy = 1 # can't figure out how to return a df when subsetting rows of a df with a single col
  meta = data.table(data_list[[panel]]$metadata)
  
  imputed_data[[panel]] = list()
  
  for(tissue in unique(meta[,tissue_code])){
    
    filtered=F
    imputed_data[[panel]][[tissue]] = list()
    
    curr_meta = meta[tissue_code==tissue]
    curr_meta[,group := factor(group, levels=c('control','1w','2w','4w','8w'))]
    curr_meta = curr_meta[order(sex, group)] # sort
    curr_data = data[as.character(curr_meta[,vial_label]),] # subset data
    curr_data$dummy = NULL
    
    n_missing = sum(apply(curr_data, c(1,2), is.na))
    writeLines(sprintf("N missing in %s %s: %s", panel, tissue, n_missing))
    
    if(n_missing == 0){
      imputed_data[[panel]][[tissue]]$mfi_log2_filt_imputed = curr_data
      new_meta = as.data.frame(curr_meta, row.names=curr_meta[,vial_label]) # why isn't row.names working?
      rownames(new_meta) = new_meta$vial_label
      imputed_data[[panel]][[tissue]]$metadata = new_meta
      next
    }
    
    meta_df = data.frame(curr_meta[,.(vial_label, sex, group)])
    rownames(meta_df) = meta_df$vial_label
    meta_df$vial_label = NULL
    colors = list(sex = sex_cols[names(sex_cols) %in% meta_df$sex], 
                  group = group_cols[names(group_cols) %in% meta_df$group])
    
    pheatmap(t(curr_data),
             cluster_rows=F,
             cluster_cols=F,
             scale='row',
             na_col = 'black',
             main = sprintf('%s %s with missing values', panel, tissue),
             annotation_col = meta_df,
             annotation_colors = colors)
    
    # exclude samples with more than 50% missing values
    bin_missing = is.na(curr_data)
    fraction_missing = rowSums(bin_missing)/ncol(bin_missing)
    need_to_remove = names(fraction_missing)[fraction_missing > 0.5]
    if(length(need_to_remove) > 0){
      # writeLines(sprintf("The following samples in %s %s have more than 50%% missing values and will be removed: %s",
      #                    panel, tissue, paste0(need_to_remove, collapse=', ')))
      missing_dt = curr_meta[vial_label %in% need_to_remove, .(vial_label, tissue_code, panel_name)]
      missing_dt[,fraction_missing := fraction_missing[match(missing_dt[,vial_label], names(fraction_missing))]]
      removed_samples[[i]] = missing_dt
      i=i+1
      curr_data = curr_data[!rownames(curr_data) %in% need_to_remove,]
      curr_meta = curr_meta[!vial_label %in% need_to_remove]
      filtered=T
    }
    
    # exclude features where more than 2 values for a single group are NA 
    data_melt = reshape2::melt(as.matrix(curr_data))
    colnames(data_melt) =  c('vial_label','feature_ID','value')
    missing_data = data_melt[is.na(data_melt$value),]
    # merge with meta
    missing_data = data.table(merge(missing_data, curr_meta[,.(vial_label, sex, group)], by='vial_label'))
    # for a given feature, are there more than 2 missing values from a single group?
    missing_data[,sex_group := paste0(sex, '_', group)]
    sums = missing_data[,list(n_missing_per_group = sum(is.na(value))),
                 by=.(feature_ID, sex_group)]
    dont_impute = as.character(unique(sums[n_missing_per_group > 1, feature_ID]))
    if(length(dont_impute) > 0){
      sums2 = missing_data[feature_ID %in% dont_impute,list(n_missing = sum(is.na(value))),
                   by=.(feature_ID)]
      # remove these features from the data entirely since this affects so few features and would otherwise complicate downstream analyses to conditionally impute values 
      curr_data[,dont_impute] = NULL
      writeLines(sprintf("The following features in %s %s have more than one missing value per sex_group and are being excluded: %s", panel, tissue, paste0(dont_impute, collapse=', ')))
      sums = sums[order(n_missing_per_group, decreasing=T)]
      print(sums[feature_ID %in% dont_impute])
      filtered=T
    }
    
    if(filtered){
        pheatmap(t(curr_data),
           cluster_rows=F,
           cluster_cols=F,
           scale='row',
           na_col = 'black',
           main = sprintf('%s %s with missing values, filtered', panel, tissue),
           annotation_col = meta_df,
           annotation_colors = colors)
    }

    # impute with KNN
    k = 5
    
    meta_df = data.frame(curr_meta[,.(vial_label, sex, group)])
    rownames(meta_df) = meta_df$vial_label
    meta_df$vial_label = NULL
    colors = list(sex = sex_cols[names(sex_cols) %in% meta_df$sex], 
                  group = group_cols[names(group_cols) %in% meta_df$group])
    
    imputed = impute.knn(as.matrix(curr_data), k=k, rowmax=0.3, colmax=0.5) # analyte x sample
    pheatmap(t(imputed$data),
             cluster_rows=F,
             cluster_cols=F,
             scale='row',
             na_col = 'black',
             main = sprintf('%s %s filtered and imputed (k=%s)', panel, tissue, k),
             annotation_col = meta_df,
             annotation_colors = colors)
    
    # replace data
    imputed_data[[panel]][[tissue]]$mfi_log2_filt_imputed = as.data.frame(imputed$data)
    new_meta = as.data.frame(curr_meta, row.names=curr_meta[,vial_label]) # why isn't row.names working?
    rownames(new_meta) = new_meta$vial_label
    imputed_data[[panel]][[tissue]]$metadata = new_meta
    
  }
}
## N missing in ADIPONECTIN t31-plasma: 0
## N missing in ADIPONECTIN t55-gastrocnemius: 0
## N missing in ADIPONECTIN t69-brown-adipose: 0
## N missing in ADIPONECTIN t70-white-adipose: 0
## N missing in SERPIN-E t31-plasma: 0
## N missing in SERPIN-E t55-gastrocnemius: 0
## N missing in SERPIN-E t69-brown-adipose: 0
## N missing in SERPIN-E t70-white-adipose: 0
## N missing in rat-mag27plex t31-plasma: 0
## N missing in rat-mag27plex t52-hippocampus: 0
## N missing in rat-mag27plex t53-cortex: 0
## N missing in rat-mag27plex t55-gastrocnemius: 2

## N missing in rat-mag27plex t56-vastus-lateralis: 0
## N missing in rat-mag27plex t58-heart: 0
## N missing in rat-mag27plex t59-kidney: 0
## N missing in rat-mag27plex t61-colon: 148

## The following features in rat-mag27plex t61-colon have more than one missing value per sex_group and are being excluded: LEPTIN, IL1B, IP10, IL10
##     feature_ID      sex_group n_missing_per_group
##  1:     LEPTIN        male_1w                   3
##  2:     LEPTIN        male_4w                   2
##  3:       IL1B        male_1w                   2
##  4:       IP10        male_1w                   2
##  5:       IL10        male_1w                   2
##  6:     LEPTIN      female_1w                   2
##  7:       IL1B      female_1w                   2
##  8:       IP10      female_1w                   2
##  9:       IL10      female_1w                   2
## 10:       IL1B        male_8w                   1
## 11:       IP10        male_8w                   1
## 12:     LEPTIN        male_8w                   1
## 13:       IL10        male_8w                   1
## 14:     LEPTIN   male_control                   1
## 15:       IL10   male_control                   1
## 16:       IP10   male_control                   1
## 17:       IL1B   male_control                   1
## 18:     LEPTIN female_control                   1
## 19:       IL10 female_control                   1
## 20:       IL1B female_control                   1
## 21:       IP10 female_control                   1
## 22:       IL1B        male_4w                   1
## 23:       IL10        male_4w                   1
## 24:       IP10        male_4w                   1
## 25:     LEPTIN      female_4w                   1
## 26:       IL10        male_2w                   1
## 27:       IL1B        male_2w                   1
## 28:     LEPTIN        male_2w                   1
## 29:       IP10        male_2w                   1
##     feature_ID      sex_group n_missing_per_group

## N missing in rat-mag27plex t62-spleen: 8

## The following features in rat-mag27plex t62-spleen have more than one missing value per sex_group and are being excluded: IL4
##    feature_ID sex_group n_missing_per_group
## 1:        IL4   male_1w                   2
## 2:        IL4 female_8w                   1
## 3:        IL4 female_4w                   1

## N missing in rat-mag27plex t63-testes: 0
## N missing in rat-mag27plex t66-lung: 10

## N missing in rat-mag27plex t67-small-intestine: 0
## N missing in rat-mag27plex t68-liver: 0
## N missing in rat-mag27plex t69-brown-adipose: 0
## N missing in rat-mag27plex t70-white-adipose: 1

## N missing in rat-mag27plex t60-adrenal: 0
## N missing in rat-mag27plex t64-ovaries: 0
## N missing in rat-metabolic t31-plasma: 0
## N missing in rat-metabolic t55-gastrocnemius: 4

## N missing in rat-metabolic t61-colon: 0
## N missing in rat-metabolic t67-small-intestine: 0
## N missing in rat-metabolic t68-liver: 0
## N missing in rat-metabolic t69-brown-adipose: 0
## N missing in rat-metabolic t70-white-adipose: 2

## N missing in rat-metabolic t60-adrenal: 0
## N missing in rat-myokine t31-plasma: 0
## N missing in rat-myokine t52-hippocampus: 0
## N missing in rat-myokine t53-cortex: 0
## N missing in rat-myokine t55-gastrocnemius: 2

## N missing in rat-myokine t56-vastus-lateralis: 0
## N missing in rat-myokine t58-heart: 0
## N missing in rat-myokine t63-testes: 0
## N missing in rat-myokine t69-brown-adipose: 0
## N missing in rat-myokine t70-white-adipose: 1

## N missing in rat-myokine t64-ovaries: 0
## N missing in rat-pituitary t31-plasma: 0
## N missing in rat-pituitary t52-hippocampus: 0
## N missing in rat-pituitary t53-cortex: 0
## N missing in rat-pituitary t63-testes: 0
## N missing in rat-pituitary t60-adrenal: 0
## N missing in rat-pituitary t64-ovaries: 0
removed_samples = rbindlist(removed_samples)
writeLines("Samples removed due to >50% missing values:")
## Samples removed due to >50% missing values:
removed_samples
##     vial_label tissue_code    panel_name fraction_missing
## 1: 90245016105   t61-colon rat-mag27plex        0.6296296
## 2: 90232016105   t61-colon rat-mag27plex        0.5555556
## 3: 90439016105   t61-colon rat-mag27plex        0.7037037
## 4: 90449016105   t61-colon rat-mag27plex        0.8148148
## 5: 90292016105   t61-colon rat-mag27plex        0.7037037

Identify analyte-specific outliers

Here we define outliers as samples with a measurement more than 4 standard deviations from the mean for a specific panel, tissue, and analyte. Replace outliers with NA for differential analysis.

outlier_report_list = list()
i=1
for(panel in names(imputed_data)){
  for(tissue in names(imputed_data[[panel]])){
    meta = imputed_data[[panel]][[tissue]]$metadata
    mfi = imputed_data[[panel]][[tissue]]$mfi_log2_filt_imputed
    mfi_outliers_removed = copy(mfi)
    # for each analyte, check for outliers 
    for(analyte in colnames(mfi)){
      curr_mfi = mfi[analyte]
      curr_data = merge(curr_mfi, meta, by="row.names")
      assert(nrow(curr_data) == nrow(curr_mfi))

      values = curr_data[,analyte]
      names(values) = curr_data$vial_label
      
      # are any values outside of 3 sd of the mean?
      curr_mean = mean(values, na.rm=T)
      curr_sd = sd(values, na.rm=T)
      zscores = unlist(lapply(values, function(x){
        (x-curr_mean)/curr_sd
      }))
      zscore_outliers = zscores[abs(zscores) > 4]
      if(length(zscore_outliers) > 0){
        outlier_report_list[[i]] = data.table(viallabel = names(zscore_outliers),
                                              panel = panel, 
                                              tissue_code = tissue,
                                              feature_ID = analyte, 
                                              zscore = unname(zscore_outliers))
        # replace outliers with NA
        for(v in names(zscore_outliers)){
          mfi_outliers_removed[v, analyte] = NA_real_
        }
        
        i = i+1
      }
    }
    # save table with outliers removed 
    imputed_data[[panel]][[tissue]]$mfi_log2_filt_imputed_na_outliers = mfi_outliers_removed
  }
}

outlier_report = rbindlist(outlier_report_list)
outlier_report
##       viallabel         panel          tissue_code     feature_ID    zscore
##  1: 90266016909   ADIPONECTIN    t69-brown-adipose    ADIPONECTIN -4.444743
##  2: 90560016909      SERPIN-E    t69-brown-adipose PAI-1/SERPIN-E  4.011948
##  3: 90587013109 rat-mag27plex           t31-plasma          MIP1A  4.193492
##  4: 90587013109 rat-mag27plex           t31-plasma            IL4  4.246215
##  5: 90449013109 rat-mag27plex           t31-plasma           IL1B  4.309305
##  6: 90587013109 rat-mag27plex           t31-plasma            IL2  4.059646
##  7: 90587013109 rat-mag27plex           t31-plasma           IL18  4.405682
##  8: 90587013109 rat-mag27plex           t31-plasma           VEGF  4.700419
##  9: 90587013109 rat-mag27plex           t31-plasma    FRACTALKINE  4.368671
## 10: 90245015515 rat-mag27plex    t55-gastrocnemius           IL1A -4.025528
## 11: 90266015515 rat-mag27plex    t55-gastrocnemius             KC  4.145158
## 12: 90585015606 rat-mag27plex t56-vastus-lateralis           IL18  4.037749
## 13: 90232016205 rat-mag27plex           t62-spleen            EGF  4.866053
## 14: 90225016705 rat-mag27plex  t67-small-intestine           MCP1  4.325620
## 15: 90225016705 rat-mag27plex  t67-small-intestine           TNFA  4.364519
## 16: 90222016812 rat-mag27plex            t68-liver         LEPTIN -4.025471
## 17: 90266016909 rat-mag27plex    t69-brown-adipose          MIP1A  4.710942
## 18: 90241016909 rat-mag27plex    t69-brown-adipose            EGF  4.471512
## 19: 90266016909 rat-mag27plex    t69-brown-adipose           IL18  4.082529
## 20: 90217017013 rat-mag27plex    t70-white-adipose            EGF  4.733428
## 21: 90439016005 rat-mag27plex          t60-adrenal            EGF  4.040440
## 22: 90225016705 rat-metabolic  t67-small-intestine      C Peptide  4.448006
## 23: 90225016705 rat-metabolic  t67-small-intestine           MCP1  4.340224
## 24: 90441016705 rat-metabolic  t67-small-intestine             PP  4.693731
## 25: 90222016812 rat-metabolic            t68-liver         LEPTIN -4.244638
## 26: 90560016909 rat-metabolic    t69-brown-adipose      C Peptide  4.748961
## 27: 90560016909 rat-metabolic    t69-brown-adipose        INSULIN  5.261865
## 28: 90426016005 rat-metabolic          t60-adrenal        GHRELIN  4.091739
## 29: 90232015209 rat-pituitary      t52-hippocampus             LH  4.547084
## 30: 90229015306 rat-pituitary           t53-cortex            TSH  4.694970
##       viallabel         panel          tissue_code     feature_ID    zscore
# one plot per data set
for(panel in names(imputed_data)){
  for(tissue in names(imputed_data[[panel]])){
    meta = imputed_data[[panel]][[tissue]]$metadata
    mfi = imputed_data[[panel]][[tissue]]$mfi_log2_filt_imputed
    mfi = reshape2::melt(as.matrix(mfi))
    colnames(mfi) = c('vial_label','feature_ID','value')
    curr_data = merge(mfi, meta, by="vial_label")
    curr_data = merge(curr_data, outlier_report, 
                      by.x = c('vial_label','panel_name','tissue_code','feature_ID'),
                      by.y = c('viallabel','panel','tissue_code','feature_ID'),
                      all.x=T)
    curr_data$is_outlier = ifelse(is.na(curr_data$zscore), 'none', 'z')
    
    label_data = curr_data[curr_data$is_outlier == 'z',]

    # make a boxplot of all values across analytes, highlighting outliers 
    g = ggplot(curr_data, aes(x=feature_ID, y=value)) +
      geom_boxplot(outlier.shape=NA) +
      geom_point(data = curr_data[curr_data$is_outlier == 'z',],
               colour='red', aes(fill=group, shape=sex), size=2) +
      geom_point(data = curr_data[curr_data$is_outlier == 'none',],
                 position=position_jitter(width = 0.2, height = 0), 
                 aes(fill=group, shape=sex),size=1) +
      theme_classic() +
      theme(axis.text.x = element_text(angle=90, vjust=0.5, hjust=1, colour='black'),
            axis.title.x = element_blank()) +
      scale_fill_manual(values=group_cols) +
      scale_shape_manual(values=c(female=21, male=24)) +
      labs(title=sprintf("%s %s", panel, tissue)) +
      geom_text_repel(data=label_data, aes(label=bid), hjust=0, vjust=0.5, size=2, direction='y') +
      guides(fill=guide_legend(override.aes = list(shape=21, size=1.5)),
             shape=guide_legend(override.aes = list(colour='black', size=1.5)))
    print(g)
  }
}

for(panel in names(imputed_data)){
  
  ## submitted mfi
  # write one file per tissue 
  all_mfi = copy(mfi_list[[panel]])
  all_mfi[,tissue_code := sapply(vial_label, function(x){
    splits = unname(unlist(strsplit(as.character(x), "")))
    tissue_code = paste0(splits[8:9], collapse='')
    if(tissue_code=="NANA"){return(NA_character_)}
    return(paste0("T",tissue_code))
  })]
  for(t in unique(all_mfi[,tissue_code])){
    if(is.na(t)){next}
    tissue_name = bic_animal_tissue_code$tissue_name_release[bic_animal_tissue_code$bic_tissue_code==t]
    # subset
    curr_mfi = all_mfi[tissue_code==t]
    curr_mfi[,c("panel_name","plate_id","luminex_sample_name","tissue_code") := NULL]
    # write 
    write.table(curr_mfi, file=sprintf("pass1b-06_%s_immunoassay_%s_mfi.txt", tissue_name, panel), 
                sep='\t', col.names=T, row.names=F, quote=F)
  }
  

  all = imputed_data[[panel]]
  for(tissue in names(all)){
    
    ## get CHEX
    chex = mfi_list[[panel]][,.(vial_label, CHEX1, CHEX2, CHEX3, CHEX4)]
    chex_cols = c('CHEX1','CHEX2','CHEX3','CHEX4')
    chex[,(chex_cols) := lapply(.SD, log2), .SDcols = chex_cols]
    
    ## mfi_log2_filt_imputed
    curr = copy(all[[tissue]]$mfi_log2_filt_imputed)
    curr = cbind(vial_label=rownames(curr), curr)
    curr_chex = merge(curr, chex, by='vial_label')
    assert(nrow(curr_chex) == nrow(curr))
    write.table(curr_chex, file=sprintf("pass1b-06_%s_immunoassay_%s_mfi-log2-filt-imputed.txt", tissue, panel), 
                sep='\t', col.names=T, row.names=F, quote=F)
    
    ## mfi_log2_filt_imputed_na_outliers
    curr = copy(all[[tissue]]$mfi_log2_filt_imputed_na_outliers)
    curr = cbind(vial_label=rownames(curr), curr)
    curr_chex = merge(curr, chex, by='vial_label')
    assert(nrow(curr_chex) == nrow(curr))
    write.table(curr_chex, file=sprintf("pass1b-06_%s_immunoassay_%s_mfi-log2-filt-imputed-na-outliers.txt", tissue, panel), 
                sep='\t', col.names=T, row.names=F, quote=F)
  }
}

# copy to MoTrPAC's GCP' bucket 
system("gsutil -m cp *txt ADDPATH")